library("cluster")
library("dendextend")
##
## ---------------------
## Welcome to dendextend version 1.7.0
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
source("functions.R")
## Loading required package: ggplot2
# ---- Corpus size check and choice of the columns to keep ----
# The word-frequency table was built once with Stylo and cached as CSV; the
# generating calls are kept below, commented out, for reference.
# Get data with Stylo
# data = stylo::load.corpus.and.parse(corpus.dir = "dh-meier-data/output/transkribus/tokenized/boudams/", features = "w", ngram.size = 1, preserve.case = FALSE)
# Get freq lists
#data = stylo::make.table.of.frequencies(corpus = data, features = unique(sort(unlist(data))), relative = FALSE)
# Write it
#write.csv(as.matrix(data), "data/transkr_expanded_words.csv")
data <- read.csv("data/transkr_expanded_words.csv", header = TRUE, row.names = 1)
data <- t(data)
# Column totals of the transposed table (one value per column of `data`)
nwords <- colSums(data)
summary(nwords)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 298 2244 3539 5070 6774 18971
boxplot(nwords)
# Print the outliers through boxplot.stats() instead of calling boxplot() a
# second time: same `$out` values (both use the default coefficient of 1.5),
# without drawing the identical figure again.
boxplot.stats(nwords)$out
## 05_Ano_Leg-A_Ap_NA_Vie_Jacques 29_Wau_Leg-C_Co_Ev_Vie_Martin
## 17920 14432
## 31_Wau_Leg-C_Co_Ev_Dia_Martin3 34_Wau_Leg-C_Co_Ev_Vie_Martial
## 18971 15255
# The 15 smallest totals
head(sort(nwords), n = 15)
## 03_Ano_Leg-A_Ap_NA_Mar_Jean 62_Ano_Leg-N_NA_NA_NA_Index
## 298 301
## 61_Ano_Leg-B_NA_NA_NA_Jugement 30_Wau_Leg-C_Co_Ev_Tra_Martin2
## 406 722
## 08_Ano_Leg-A_Ap_NA_Vie_Philippe 59_Ano_Leg-C_Vi_NA_Vie_Euphrasie
## 1014 1293
## 09_Ano_Leg-A_Ap_NA_Vie_JacquesMineur 32_Wau_Leg-C_Co_Ev_Vie_Brice
## 1356 1385
## 60_Ano_Leg-B_NA_NA_NA_Antechriste 54_Ano_Leg-C_Vi_NA_Vie_Pelagie
## 1485 1506
## 20_Ano_Leg-B_Ma_Fe_Vie_Felicite 11_Ano_Leg-A_Ap_NA_Vie_Marc
## 1676 1820
## 23_Ano_Leg-B_Ma_Ho_Vie_Sixte 53_Ano_Leg-C_Vi_NA_Vie_Marguerite
## 1894 1935
## 35_Wau_Leg-C_Co_Ev_Vie_Nicolas
## 1960
# Keep the columns whose total exceeds 1000, then discard any whose name
# matches "Bestiaire". `toKeep` is reused by every later section of the script.
toKeep <- colnames(data)[nwords > 1000]
toKeep <- toKeep[grep("Bestiaire", toKeep, invert = TRUE)]
# ---- Character 3-grams computed on the raw transcription ----
# One-off Stylo extraction that produced the cached CSV (kept for reference):
# Get data with Stylo
#data = stylo::load.corpus.and.parse(corpus.dir = "dh-meier-data/output/transkribus/raw/", features = "c", ngram.size = 3, preserve.case = FALSE)
# Get freq lists
#data = stylo::make.table.of.frequencies(corpus = data, features = unique(sort(unlist(data))), relative = FALSE)
# Write it
#write.csv(as.matrix(data), "data/transkr_raw_char3grams.csv")
data <- t(read.csv("data/transkr_raw_char3grams.csv", header = TRUE, row.names = 1))
# Restrict to the retained columns, then drop rows that are empty afterwards
data <- data[, toKeep]
data <- data[rowSums(data) > 0, ]
d <- data

# Feature selection (after Moisl 2011); the 4th column of the result is the
# row index applied to `d` below
select <- selection(d, z = 1.645)[, 4]

# Relative frequencies; this table is stored BEFORE the selection is applied
# (save data for robustness checks)
d <- relativeFreqs(d)
Raw3grSave <- d

# Apply the selection, normalise, and cluster the columns of `d`
# (agnes() receives the transposed table)
d <- normalisations(d[select, ])
CAHRaw3gr <- myCAH <- cluster::agnes(t(d), metric = "manhattan", method = "ward")

#TODO: heights
# barplot(sort(myCAH$height))
plotRaw3grams <- cahPlotCol(
  myCAH,
  k = 9,
  main = "Characters 3-grams from raw data (Transkr)"
)

# Same features, clustered through somCluster() instead
somCAH <- somCluster(d)
somplotRaw3grams <- cahPlotCol(
  somCAH,
  k = 9,
  main = "SOM BASED - Characters 3-grams from raw data (Transkr)"
)
# ---- Expanded word forms ----
data <- t(read.csv("data/transkr_expanded_words.csv", header = TRUE, row.names = 1))
# Restrict to the retained columns, then drop rows that are empty afterwards
data <- data[, toKeep]
data <- data[rowSums(data) > 0, ]
d <- data

# Feature selection (after Moisl 2011); the 4th column of the result is the
# row index applied to `d` below
select <- selection(d, z = 1.645)[, 4]

# Relative frequencies; this table is stored BEFORE the selection is applied
# (save data for robustness checks)
d <- relativeFreqs(d)
WordsSave <- d

# Apply the selection, normalise, and cluster the columns of `d`
d <- normalisations(d[select, ])
CAHForms <- myCAH <- cluster::agnes(t(d), metric = "manhattan", method = "ward")

#TODO: heights
# barplot(sort(myCAH$height))
plotForms <- cahPlotCol(
  myCAH,
  k = 9,
  main = "Expanded word forms (Transkr/Boudams/Pie)"
)

# Same features, clustered through somCluster() instead
somCAH <- somCluster(d)
somplotForms <- cahPlotCol(
  somCAH,
  k = 9,
  main = "SOM BASED - Expanded word forms (Transkr/Boudams/Pie)"
)
# ---- Affixes ----
# Build the affix table from whatever `data` currently holds (the filtered
# word-form table when the script is run top to bottom)
d <- dataAffs <- countAffixes(data)

# Feature selection (after Moisl 2011); the 4th column of the result is the
# row index applied to `d` below
select <- selection(d, z = 1.645)[, 4]

# Relative frequencies restricted to the selected rows; here the saved copy
# is taken AFTER the selection
d <- relativeFreqs(d)
d <- d[select, ]
AffixesSave <- d

# Normalise and cluster the columns of `d`
d <- normalisations(d)
CAHAffs <- myCAH <- cluster::agnes(t(d), metric = "manhattan", method = "ward")

#TODO: heights
# barplot(sort(myCAH$height))
plotAffixes <- cahPlotCol(
  myCAH,
  k = 9,
  main = "Expanded affixes (Transkr/Boudams/Pie)"
)

# Same features, clustered through somCluster() instead
somCAH <- somCluster(d)
somplotAffixes <- cahPlotCol(
  somCAH,
  k = 9,
  main = "SOM BASED - Expanded affixes (Transkr/Boudams/Pie)"
)
# ---- Function words ----
#labels(sort(rowSums(data), decreasing = TRUE)[1:300])
# With or without pronouns?
# The list of function words is the value of the last expression evaluated
# in functionWords.R
functionWords <- source("functionWords.R")$value

# Relative frequencies of `data`, restricted to the function-word rows
d <- relativeFreqs(data)[functionWords, ]
# save data for robustness checks
FWSave <- d

# Normalise and cluster the columns of `d`
d <- normalisations(d)
CAHFW <- myCAH <- cluster::agnes(t(d), metric = "manhattan", method = "ward")

# barplot(sort(myCAH$height))
# NOTE(review): this plot uses k = 8 while the SOM plot below uses k = 9
plotFW <- cahPlotCol(
  myCAH,
  k = 8,
  main = "Function words with pronouns and auxiliaries\n(Transkr/Boudams/Pie)"
)
#plotCol(myCAH, main = "toto")

# Same features, clustered through somCluster() instead
somCAH <- somCluster(d)
somplotFW <- cahPlotCol(somCAH, k = 9, main = "SOM BASED - Function words")
# ---- POS 3-grams ----
data <- read.csv("data/transkr_pos3-gr.csv", header = TRUE, row.names = 1, sep = ";")
#remove total freq
data <- data[, -1]

# Bring the column names back in line with the names stored in `toKeep`:
# strip the leading "X", strip the "decolumnized" suffix, and restore the
# hyphen after "Leg".
colnames(data) <- gsub("^X", "", colnames(data))
# NOTE(review): the leading "." is still a regex wildcard here; the character
# that precedes "decolumnized" in the raw header is not visible from this
# script, so the pattern is left untouched - confirm and make it literal.
colnames(data) <- gsub(".decolumnized", "", colnames(data))
# Match "Leg." literally: as a regex the dot would also rewrite "Leg" followed
# by ANY character (e.g. "Lege..." would become "Leg-...").
colnames(data) <- gsub("Leg.", "Leg-", colnames(data), fixed = TRUE)

# Restrict to the retained columns, drop rows that are empty afterwards, and
# continue with a plain matrix
data <- data[, toKeep]
data <- data[rowSums(data) > 0, ]
data <- as.matrix(data)
d <- data

# Feature selection (after Moisl 2011); the 4th column of the result is the
# row index applied to `d` below
select <- selection(d, z = 1.645)[, 4]

# Relative frequencies restricted to the selected rows
d <- relativeFreqs(d)
d <- d[select, ]
# save data for robustness checks (copy taken AFTER the selection)
POS3grSave <- d

# Normalise and cluster the columns of `d`
d <- normalisations(d)
CAHPOS3gr <- myCAH <- cluster::agnes(t(d), metric = "manhattan", method = "ward")

#TODO: heights
# barplot(sort(myCAH$height))
plotPOS3grams <- cahPlotCol(
  myCAH,
  k = 9,
  main = "POS 3-grams (Transkr/Boudams/Pie/Pie)"
)

# Same features, clustered through somCluster() instead
somCAH <- somCluster(d)
somplotPOS3grams <- cahPlotCol(somCAH, k = 9, main = "SOM BASED - POS 3-grams")
# ---- Lemmas ----
data <- read.csv("data/transkr_lemmas.csv", header = TRUE, row.names = 1, sep = ";")
#remove total freq
data <- data[, -1]

# Bring the column names back in line with the names stored in `toKeep`:
# strip the leading "X", strip the "decolumnized" suffix, and restore the
# hyphen after "Leg".
colnames(data) <- gsub("^X", "", colnames(data))
# NOTE(review): the leading "." is still a regex wildcard here; the character
# that precedes "decolumnized" in the raw header is not visible from this
# script, so the pattern is left untouched - confirm and make it literal.
colnames(data) <- gsub(".decolumnized", "", colnames(data))
# Match "Leg." literally: as a regex the dot would also rewrite "Leg" followed
# by ANY character (e.g. "Lege..." would become "Leg-...").
colnames(data) <- gsub("Leg.", "Leg-", colnames(data), fixed = TRUE)

# Restrict to the retained columns, drop rows that are empty afterwards, and
# continue with a plain matrix
data <- data[, toKeep]
data <- data[rowSums(data) > 0, ]
data <- as.matrix(data)
d <- data

# Feature selection (after Moisl 2011); the 4th column of the result is the
# row index applied to `d` below
select <- selection(d, z = 1.645)[, 4]

# Relative frequencies restricted to the selected rows; the saved copy is
# taken AFTER the selection
d <- relativeFreqs(d)
d <- d[select, ]
LemmasSave <- d

# Normalise and cluster the columns of `d`
d <- normalisations(d)
CAHLemmas <- myCAH <- cluster::agnes(t(d), metric = "manhattan", method = "ward")

#TODO: heights
# barplot(sort(myCAH$height))
plotLemmas <- cahPlotCol(
  myCAH,
  k = 9,
  main = "Lemmas (Transkr/Boudams/Pie/Pie)"
)

# Same features, clustered through somCluster() instead
somCAH <- somCluster(d)
somplotLemmas <- cahPlotCol(somCAH, k = 9, main = "SOM BASED - Lemmas")
# ---- Function lemmas ----
# Find function words
#rownames(data)[1:250]
# The list of function lemmas is the value of the last expression evaluated
# in functionLemmas.R
functionLemmas <- source("functionLemmas.R")$value

# Relative frequencies of `data` (the lemma matrix when the script is run top
# to bottom), restricted to the function-lemma rows
d <- relativeFreqs(data)[functionLemmas, ]
FLSave <- d

# Normalise and cluster the columns of `d`
d <- normalisations(d)
CAHFL <- myCAH <- cluster::agnes(t(d), metric = "manhattan", method = "ward")

# barplot(sort(myCAH$height))
# NOTE(review): this plot uses k = 8 while the SOM plot below uses k = 9
plotFL <- cahPlotCol(
  myCAH,
  k = 8,
  main = "Function Lemmas with pronouns and auxiliaries\n(Transkr/Boudams/Pie)"
)
#plotCol(myCAH, main = "toto")

# Same features, clustered through somCluster() instead
somCAH <- somCluster(d)
somplotFL <- cahPlotCol(
  somCAH,
  k = 9,
  main = "SOM BASED - Function words (lemmas)"
)
# ---- Combined feature set: affixes + POS 3-grams + function lemmas ----
# Stack the three saved tables row-wise. rbind() aligns columns by position,
# not by name - presumably all three share the `toKeep` column order; verify.
data <- rbind(AffixesSave, POS3grSave, FLSave)

# Normalise and cluster the columns of the stacked table
d <- normalisations(data)
CAHGlob <- myCAH <- cluster::agnes(t(d), metric = "manhattan", method = "ward")

#TODO: heights
# barplot(sort(myCAH$height))
plotGlob <- cahPlotCol(
  myCAH,
  k = 9,
  main = "Affixes + POS 3- grams + Function words (lemmas)"
)

# Same features, clustered through somCluster() instead
somCAH <- somCluster(d)
somplotGlob <- cahPlotCol(
  somCAH,
  k = 9,
  main = "SOM BASED - Affixes + POS 3- grams + Function words (lemmas)"
)
# ---- Figure grids and comparison of the clusterings ----
# Earlier six-panel layout, kept commented out for reference:
#featlabel = "features of ME ±2σ with conf. > 90%"
#A = cahPlotCol(CAHLemma, main = "A", xlab = paste( ncol(CAHLemma$data), featlabel), k = 6, lrect = -12)
# B = cahPlotCol(CAHRhyme, main = "B", xlab = paste( ncol(CAHRhyme$data), featlabel), k = 6, lrect = -7, ylab = " ")
# C = cahPlotCol(CAHAllWords, main = "C", xlab = paste( ncol(CAHAllWords$data), featlabel), k = 6, ylab = " ")
# D = cahPlotCol(CAHAffs, main = "D", xlab = paste( ncol(CAHAffs$data), featlabel), k = 6, ylab = " ")
# E = cahPlotCol(CAHPOS3gr, main = "E", xlab = paste( ncol(CAHPOS3gr$data), featlabel), k = 6, lrect = -12 , ylab = " ")
# F = cahPlotCol(CAHmfw, main = "F", k = 6, lrect = -5, ylab = " ")
# gridExtra::grid.arrange(A, B, C, D, E, F, ncol = 2)

# The eight agnes-based plots on a two-column grid
gridExtra::grid.arrange(
  plotRaw3grams, plotForms,
  plotAffixes, plotFW,
  plotLemmas, plotFL,
  plotPOS3grams, plotGlob,
  ncol = 2
)

# The eight somCluster-based plots, in the same order
gridExtra::grid.arrange(
  somplotRaw3grams, somplotForms,
  somplotAffixes, somplotFW,
  somplotLemmas, somplotFL,
  somplotPOS3grams, somplotGlob,
  ncol = 2
)

# Named list of the eight agnes results, handed to compareHC() with k = 9;
# the recorded output follows.
cahList <- list(
  raw3grams = CAHRaw3gr,
  Forms = CAHForms,
  Affs = CAHAffs,
  FW = CAHFW,
  Lemmas = CAHLemmas,
  FunctLemm = CAHFL,
  POS3gr = CAHPOS3gr,
  Global = CAHGlob
)
compareHC(cahList, k = 9)
## raw3grams Forms Affs FW Lemmas FunctLemm
## raw3grams 1.0000000 0.8135593 0.8305085 0.7966102 0.7288136 0.7288136
## Forms 0.8135593 1.0000000 0.6949153 0.7288136 0.6440678 0.6779661
## Affs 0.8644068 0.7118644 1.0000000 0.7796610 0.7457627 0.7118644
## FW 0.8135593 0.7288136 0.7627119 1.0000000 0.6949153 0.7627119
## Lemmas 0.7288136 0.6440678 0.7457627 0.7118644 1.0000000 0.6440678
## FunctLemm 0.7288136 0.6779661 0.6949153 0.7966102 0.6779661 1.0000000
## POS3gr 0.6779661 0.6779661 0.6440678 0.7627119 0.6440678 0.6610169
## Global 0.8305085 0.8135593 0.8135593 0.8135593 0.6610169 0.7966102
## POS3gr Global
## raw3grams 0.7288136 0.8135593
## Forms 0.6779661 0.7966102
## Affs 0.7288136 0.8474576
## FW 0.7627119 0.7627119
## Lemmas 0.6779661 0.6440678
## FunctLemm 0.7118644 0.7966102
## POS3gr 1.0000000 0.7627119
## Global 0.8135593 1.0000000